/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser;

import static org.junit.jupiter.api.Assertions.fail;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.commons.io.IOUtils;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.sax.ToHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;

/**
 * Tests XXE (XML external entity) handling for formats that are essentially XML:
 * plain XML files and zipped XML containers, e.g. ebook and OOXML files.
 * <p>
 * It does not test XXE prevention in files that may merely contain XML,
 * such as PDFs and other XMP-containing files.
 */
public class TestXXEInXML extends XMLTestBase {
    //TODO: figure out how to test XFA and xmp in PDFs

    /**
     * DOCTYPE declaration with an external identifier pointing at a file that does not
     * exist. The tests below expect a parser that resolves it to fail with a
     * {@link FileNotFoundException}.
     */
    private static final byte[] XXE =
            "<!DOCTYPE roottag PUBLIC \"-//OXML/XXE/EN\" \"file:///couldnt_possibly_exist\">"
                    .getBytes(StandardCharsets.UTF_8);

    /**
     * Confirms that the deliberately vulnerable parser fails with a
     * {@link FileNotFoundException} on the XXE test document. Disabled by default.
     */
    @Test
    @Disabled("ignore vulnerable tests")
    public void testConfirmVulnerable() throws Exception {
        try (InputStream is = getResourceAsStream("/test-documents/testXXE.xml")) {
            parse("testXXE.xml", is, new VulnerableSAXParser(), new ParseContext());
            fail("should have failed!!!");
        } catch (FileNotFoundException e) {
            //expected
        }
    }

    /**
     * Parses the XXE test document with the auto-detect parser; any exception fails the test.
     */
    @Test
    public void testXML() throws Exception {
        try (InputStream is = getResourceAsStream("/test-documents/testXXE.xml")) {
            parse("testXXE.xml", is, AUTO_DETECT_PARSER, new ParseContext());
        }
    }

    /**
     * Checks that the injection itself works: the vulnerable parser must fail with a
     * {@link FileNotFoundException} on a trivial document with the XXE payload injected.
     */
    @Test
    public void testInjectedXML() throws Exception {
        byte[] bytes = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><document>blah</document>"
                .getBytes(StandardCharsets.UTF_8);
        byte[] injected = injectXML(bytes, XXE);
        try {
            parse("injected", new ByteArrayInputStream(injected), new VulnerableSAXParser(),
                    new ParseContext());
            fail("injected should have triggered xxe");
        } catch (FileNotFoundException e) {
            //expected
        }
    }

    /**
     * Injects the XXE payload into the Word 2003 and 2006 flat XML test documents and
     * parses each with the auto-detect parser; any exception fails the test.
     */
    @Test
    public void test2003_2006xml() throws Exception {
        for (String fileName : new String[]{"testWORD_2003ml.xml", "testWORD_2006ml.xml"}) {
            byte[] injected = injectXXEIntoResource("/test-documents/" + fileName);
            parse(fileName, new ByteArrayInputStream(injected), AUTO_DETECT_PARSER,
                    new ParseContext());
        }
    }


    /**
     * Runs {@link #_testPOIOOXMLs(String)} over a set of docx/docm, xlsx/xlsm and
     * pptx/pptm test documents.
     */
    @Test
    public void testPOIOOXMLs() throws Exception {
        for (String fileName : new String[]{"testWORD.docx", "testWORD_1img.docx",
                "testWORD_2006ml.docx", "testWORD_embedded_pics.docx", "testWORD_macros.docm",
                "testEXCEL_textbox.xlsx", "testEXCEL_macro.xlsm", "testEXCEL_phonetic.xlsx",
                "testEXCEL_embeddedPDF_windows.xlsx", "testPPT_2imgs.pptx", "testPPT_comment.pptx",
                "testPPT_EmbeddedPDF.pptx", "testPPT_macros.pptm"}) {
            _testPOIOOXMLs(fileName);
        }
    }

    /**
     * Injects the XXE payload into the zipped XML parts of an OOXML test document and
     * parses the result twice: first with the default parse context, then with the
     * SAX-based docx/pptx extractors enabled.
     *
     * @param fileName name of the document under {@code /test-documents/}
     * @throws Exception on any unexpected parse or I/O failure
     */
    private void _testPOIOOXMLs(String fileName) throws Exception {
        try (TikaInputStream tis = TikaInputStream
                .get(getResourceAsStream("/test-documents/" + fileName))) {
            Path originalOOXML = tis.getPath();

            //first pass: default configuration
            Path injected = injectZippedXMLs(originalOOXML, XXE, false);
            try {
                //if the SafeContentHandler is turned off, this will throw an FNFE
                parseAndClose(injected, new ToHTMLContentHandler(), new Metadata(),
                        new ParseContext());
            } catch (TikaException e) {
                if (!(e.getCause() instanceof InvalidFormatException)) {
                    //as of POI 4.1.x
                    fail("POI should have thrown an IFE complaining about " +
                            "not being able to read content types part !", e);
                }
            } finally {
                Files.deleteIfExists(injected);
            }

            //second pass: SAX-based extractors; a separate variable is used so that a
            //failure while building this file cannot trigger a delete of the first one
            Path saxInjected = injectZippedXMLs(originalOOXML, XXE, true);
            try {
                OfficeParserConfig officeParserConfig = new OfficeParserConfig();
                officeParserConfig.setUseSAXDocxExtractor(true);
                officeParserConfig.setUseSAXPptxExtractor(true);
                ParseContext parseContext = new ParseContext();
                parseContext.set(OfficeParserConfig.class, officeParserConfig);

                parseAndClose(saxInjected, new ToHTMLContentHandler(), new Metadata(),
                        parseContext);
            } catch (FileNotFoundException e) {
                //keep the cause attached to the failure instead of printing it
                fail("problem with SAX-based: " + fileName + ": " + e.getMessage(), e);
            } finally {
                Files.deleteIfExists(saxInjected);
            }
        }
    }

    /**
     * Runs {@link #_testXMLInZips(String)} over the non-OOXML zipped XML test documents.
     */
    @Test
    public void testXMLInZips() throws Exception {
        for (String fileName : new String[]{"testEPUB.epub"}) {
            _testXMLInZips(fileName);
        }
    }

    /**
     * Injects the XXE payload into the zipped XML parts of a test document and parses
     * the result with the auto-detect parser; any exception fails the test.
     *
     * @param fileName name of the document under {@code /test-documents/}
     * @throws Exception on any parse or I/O failure
     */
    private void _testXMLInZips(String fileName) throws Exception {
        Path injected;
        try (TikaInputStream tis = TikaInputStream
                .get(getResourceAsStream("/test-documents/" + fileName))) {
            injected = injectZippedXMLs(tis.getPath(), XXE, false);
        }
        try {
            //if the SafeContentHandler is turned off, this will throw an FNFE
            parseAndClose(injected, new ToHTMLContentHandler(), new Metadata(),
                    new ParseContext());
        } finally {
            Files.deleteIfExists(injected);
        }
    }


    /**
     * Builds a DOM from an XXE-injected document more times than the reader pool size;
     * none of the calls may throw.
     */
    @Test
    public void testDOM() throws Exception {
        byte[] bytes = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><document>blah</document>"
                .getBytes(StandardCharsets.UTF_8);
        byte[] injected = injectXML(bytes, XXE);
        int iterations = XMLReaderUtils.getPoolSize() * 2;
        for (int i = 0; i < iterations; i++) {
            //this shouldn't throw an exception
            XMLReaderUtils.buildDOM(new ByteArrayInputStream(injected), new ParseContext());
        }
    }

    /**
     * Manual check, intentionally not annotated with {@code @Test}.
     * Use this to confirm that this works by manually turning off the
     * SafeContentHandler in SXWPFWordExtractorDecorator's handlePart.
     */
    public void testDocxWithIncorrectSAXConfiguration() throws Exception {
        Path injected;
        try (TikaInputStream tis = TikaInputStream
                .get(getResourceAsStream("/test-documents/testWORD_macros.docm"))) {
            injected = injectZippedXMLs(tis.getPath(), XXE, true);
        }

        ContentHandler xhtml = new ToHTMLContentHandler();
        ParseContext parseContext = new ParseContext();
        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
        officeParserConfig.setUseSAXDocxExtractor(true);
        parseContext.set(OfficeParserConfig.class, officeParserConfig);
        parseContext.set(SAXParser.class, SAXParserFactory.newInstance().newSAXParser());
        try {
            //if the SafeContentHandler is turned off, this will throw an FNFE
            parseAndClose(injected, xhtml, new Metadata(), parseContext);
        } finally {
            //the stream is closed by now, so the temp file can be removed
            Files.deleteIfExists(injected);
        }
    }

    /**
     * Tests the DOM reader in TikaConfig: if the safeguards aren't in place,
     * constructing the config from an XXE-injected file throws a FNFE.
     */
    @Test
    public void testDOMTikaConfig() throws Exception {
        byte[] injected =
                injectXXEIntoResource("/org/apache/tika/config/TIKA-1558-exclude.xml");
        new TikaConfig(new ByteArrayInputStream(injected));
    }

    /**
     * Reads a classpath resource fully and returns its bytes with the XXE payload injected.
     *
     * @param resourcePath resource path handed to {@code getResourceAsStream}
     * @return the injected document bytes
     * @throws Exception if the resource cannot be read or the injection fails
     */
    private byte[] injectXXEIntoResource(String resourcePath) throws Exception {
        try (InputStream is = getResourceAsStream(resourcePath)) {
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            IOUtils.copy(is, bos);
            return injectXML(bos.toByteArray(), XXE);
        }
    }

    /**
     * Parses a file with the auto-detect parser and always closes the file stream, so
     * the caller can delete the file afterwards.
     *
     * @param file     file to parse
     * @param handler  receives the parse events
     * @param metadata metadata object passed to the parser
     * @param context  parse context passed to the parser
     * @throws Exception whatever the parser or the stream throws
     */
    private void parseAndClose(Path file, ContentHandler handler, Metadata metadata,
                               ParseContext context) throws Exception {
        try (InputStream is = Files.newInputStream(file)) {
            AUTO_DETECT_PARSER.parse(is, handler, metadata, context);
        }
    }
}
